package com.wstuo.common.config.attachment.service;

import org.apache.log4j.Logger;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

import com.wstuo.common.security.utils.AppConfigUtils;
import com.wstuo.common.util.StringUtils;

import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.xmlbeans.XmlException;

/**
 * 从附件中提取文字
 * 
 * @author QXY
 * @version from 1.8
 * 
 */
public class ExtractTextFromAttachment {

    private static final Logger LOGGER = Logger.getLogger(ExtractTextFromAttachment.class);

	private static final String ATTACHMENT_PATH = AppConfigUtils.getInstance()
			.getAttachmentPath();

	/**
	 * 根据文件路径取得HSSFWorkbook.
	 * 
	 * @param filename
	 * @return HSSFWorkbook
	 */
	private static HSSFWorkbook getHSSFWorkbook(String filename) {
		HSSFWorkbook workbook = null;
		try {
			workbook =  new HSSFWorkbook(new FileInputStream(filename));
		} catch (FileNotFoundException e) {
			LOGGER.error(e);
		} catch (IOException e) {
		    LOGGER.error(e);
		}
		return workbook;
	}

	/**
	 * 根据文件路径取得XSSFWorkbook.
	 * 
	 * @param filename
	 */
	private static XSSFWorkbook getXSSFWorkbook(String filename) {
		XSSFWorkbook xss = null;
		try {
			xss = new XSSFWorkbook(new FileInputStream(filename));
		} catch (FileNotFoundException e) {
		    LOGGER.error(e);
		} catch (IOException e) {
		    LOGGER.error(e);
		}
		return xss;
	}

	/**
	 * 根据文件路径取得XWPFDocument.
	 * 
	 * @param filename
	 */
	private static XWPFDocument getXWPFDocument(String filename) {
		XWPFDocument xwpfDocument = null;
		try {
			xwpfDocument = new XWPFDocument(new FileInputStream(filename));
		} catch (FileNotFoundException e) {
		    LOGGER.error(e);
		} catch (IOException e) {
		    LOGGER.error(e);
		}

		return xwpfDocument;
	}

	/**
	 * 根据文件路径取得XWPFDocument.
	 * 
	 * @param filename
	 */
	private static HWPFDocument getHWPFDocument(String filename) {
		HWPFDocument hwpfDocument = null;
		try {
			hwpfDocument = new HWPFDocument(new FileInputStream(filename));
		} catch (FileNotFoundException e) {
		    LOGGER.error(e);
		} catch (IOException e) {
		    LOGGER.error(e);
		}

		return hwpfDocument;
	}


	/**
	 * 从Excel2003获取文本
	 * 
	 * @param filePath
	 */
	public String getTextFromExcel2003(String filePath) {
		String str = "";
		ExcelExtractor ee = new ExcelExtractor(getHSSFWorkbook(filePath));
		try{
			if(ee!=null&&StringUtils.hasText(ee.getText())){
				str = ee.getText();
			}
		}catch (Exception e) {
			LOGGER.error(e);
		}
		return StringUtils.hasText(str)?str.trim():"";
	}

	/**
	 * 从Word2003获取文本.
	 * 
	 * @throws IOException
	 */
	public String getTextFromWord2003(String filePath) {
		String str = "";
		WordExtractor we = new WordExtractor(getHWPFDocument(filePath));
		try{
			if(we!=null&&StringUtils.hasText(we.getText())){
				str = we.getText();
			}
		}catch (Exception e) {
			LOGGER.error(e);
		}
		return StringUtils.hasText(str)?str.trim():"";
	}
	/**
	 * 读取PPT文件内容
	 * @param filename
	 * @return String
	 */
	public String getPPTDocument(String filename) {
		String result = null;
		try {
		    StringBuffer content = new StringBuffer("");
			SlideShow ss = new SlideShow(new HSLFSlideShow(new FileInputStream(
					filename)));// is 为文件的InputStream，建立SlideShow
			Slide[] slides = ss.getSlides();// 获得每一张幻灯片
			for (int i = 0; i < slides.length; i++) {
				TextRun[] t = slides[i].getTextRuns();// 为了取得幻灯片的文字内容，建立TextRun
				for (int j = 0; j < t.length; j++) {
					content.append(t[j].getText());// 这里会将文字内容加到content中去
				}
				content.append(slides[i].getTitle());
			}
			result = content.toString();
		} catch (Exception ex) {
		    LOGGER.error(ex);
		}
		return result;
	}


	/**
	 * 从Excel2007获取文本
	 * 
	 * @param filePath
	 */
	public String getTextFromExcel2007(String filePath) {
		String str = "";
		XSSFExcelExtractor xe = new XSSFExcelExtractor(
				getXSSFWorkbook(filePath));
		try{
			if(xe!=null&&StringUtils.hasText(xe.getText())){
				str = xe.getText();
			}
		}catch (Exception e) {
			LOGGER.error(e);
		}
		return StringUtils.hasText(str)?str.trim():"";
	}

	/**
	 * 从Word2007获取文本.
	 */
	public String getTextFromWord2007(String filePath) {
		String str = "";
		XWPFWordExtractor xe = new XWPFWordExtractor(getXWPFDocument(filePath));
		try{
			if(xe!=null&&StringUtils.hasText(xe.getText())){
				str = xe.getText();
			}
		}catch (Exception e) {
			LOGGER.error(e);
		}
		return StringUtils.hasText(str)?str.trim():"";
	}

	/**
	 * 从PPT2007获取文本.
	 */
	public String getTextFromPPT2007(String filePath) {
	    String pptStr = null;
		try {
		    pptStr = new XSLFPowerPointExtractor(
					POIXMLDocument.openPackage(filePath)).getText();
		} catch (XmlException e) {
			LOGGER.error(e);
		} catch (OpenXML4JException e) {
		    LOGGER.error(e);
		} catch (IOException e) {
		    LOGGER.error(e);
		}catch (Exception e) {
			LOGGER.error(e);
		}

		return pptStr;
	}

	/**
	 * 从PDF文件中获取文本.
	 * 
	 * @param filePath
	 * @return String
	 */
	public String getTextFromPDF(String filePath) {

		String result = null;
		FileInputStream is = null;
		PDDocument document = null;
		try {
			is = new FileInputStream(filePath);
			PDFParser parser = new PDFParser(is);
			parser.parse();
			document = parser.getPDDocument();
			PDFTextStripper stripper = new PDFTextStripper();
			result = stripper.getText(document);
		} catch (FileNotFoundException e) {
			LOGGER.error(e);
		} catch (IOException e) {
			LOGGER.error(e);
		} finally {
		    try {
		    	if (is != null) {
		    		 is.close();
		    	}
		    	if (document != null) {
		    		document.close();
		    	}
            } catch (IOException e) {
                LOGGER.error(e);
                
			}
		}
		return result;
	}

	/**
	 * 获取文件编码格式.
	 * 
	 * @param fileName
	 *            :fileName
	 * @return String 文件编码格式
	 * @throws IOException
	 */
	private String getCharset(String fileName) throws IOException {

		BufferedInputStream bin = new BufferedInputStream(new FileInputStream(
				fileName));
		int p = (bin.read() << 8) + bin.read();

		String code = null;

		switch (p) {
		case 0xefbb:
			code = "UTF-8";
			break;
		case 0xfffe:
			code = "Unicode";
			break;
		case 0xfeff:
			code = "UTF-16BE";
			break;
		default:
			code = "GBK";
		}
		return code;
	}

	/**
	 * 从TEXT中获取文本.
	 * 
	 * @param filePath
	 * @return String
	 */
	public String getTextFromText(String filePath) {
	    String result = null;
	    InputStreamReader isr = null ;
	    BufferedReader br =null ;
	    StringBuffer sb = null;
		try {
			isr = new InputStreamReader(new FileInputStream(
					filePath), getCharset(filePath));
			br = new BufferedReader(isr);

			sb = new StringBuffer();
			String temp = null;
			while ((temp = br.readLine()) != null) {
				sb.append(temp);
			}
			
			result = sb.toString();
		} catch (FileNotFoundException e) {
		    LOGGER.error(e);
		} catch (IOException e) {
			LOGGER.error(e);
		} finally {
			try {
				if(br!=null){
					br.close();
				}
				if(isr!=null){
					isr.close();
				}
			} catch (IOException e) {
				LOGGER.error(e);
			}
		}
		return result;
	}

	/**
	 * 从完整路径中取得.
	 * 
	 * @param filePath
	 * @return String
	 */
	public String getTextFromFullFilePath(String filePath) {
	    String result = null;
		// 取得文件的后缀
		String fileType = filePath.substring(filePath.lastIndexOf(".") + 1,
				filePath.length()).toLowerCase();

		if ("doc".equals(fileType)) {// Word2003
		    result = getTextFromWord2003(filePath);
		}

		if ("docx".equals(fileType)) {// Word2007
		    result = getTextFromWord2007(filePath);
		}

		if ("xls".equals(fileType)) {// Excel2007
		    result = getTextFromExcel2003(filePath);
		}

		if ("xlsx".equals(fileType)) {// Excel2007
		    result = getTextFromExcel2007(filePath);
		}

		if ("pdf".equals(fileType)) {
		    result = getTextFromPDF(filePath);
		}

		if ("ppt".equals(fileType)) {
		    result = getPPTDocument(filePath);
		}

		 if("pptx".equals(fileType)){
		     // 读取PPTX文件的方法存在问题，暂时注释(tan:20120830)
		     result = getTextFromPPT2007(filePath);
		 }

		if ("txt".equals(fileType) || "csv".equals(fileType)
				|| "html".equals(fileType) // 预留一些类型
				|| "htm".equals(fileType) || "java".equals(fileType)) {

		    result = getTextFromText(filePath);
		}

		return result;
	}

	/**
	 * 获取附件内容
	 * @return String
	 */
	public String getTextFromAttcahment(String filePath) {
		String fullFilePath = ATTACHMENT_PATH
				+ "/" + filePath;
		return getTextFromFullFilePath(fullFilePath);

	}

}
